/* add mean and standard deviation of return measures, with health majors */
* add to the dataset industry_skew_major;

libname edu '!userprofile\\Dropbox\Education\Replication\Data'; 

DATA crsp;  /*read all the crsp monthly data with nonmissing return*/
set edu.crsp_data;
IF not missing (RET);
if RET ne 'C';
if RET ne 'B';
year=year(date);
month=month(date);
if 1<=EXCHCD<=3 and SHRCD in (10 11);
naics_3=compress(substr(naics,1,3));
sic_2=floor(siccd/100);
mktcap = abs(prc)*shrout;
run;

data crsp (drop = naics_3); set crsp; naics3 = input(naics_3, 12.); run;
data crsp (drop = naics3); set crsp; naics_3 = naics3; run;

/* get # employees, if there are duplicate entries, choose the larger and the most recent */
data emp (keep = lpermno datadate year emp); set edu.crspcompustat_data; year = year(datadate); run; 

proc sort data = emp; by lpermno year descending datadate descending emp; run;
proc sort data = emp nodupkey; by lpermno year; run;

proc sql;
  create table crsp as
  select distinct a.*, b.emp 
  from crsp as a left join emp as b
  on a.permno = b.lpermno and a.year = b.year;
quit;

/* map SIC2 and NAICS3 into major code (use SIC2 before 2005; NAICS3 afterwards) */
proc sql;
  create table temp as
  select a.*, b.major 
  from crsp as a left join edu.sic_2_major_bio as b
  on a.sic_2 = b.sic_2;
quit;

data temp;
  set temp;
  if year >= 2005 then delete;
run;

proc sql;
  create table temp2 as
  select a.*, b.major 
  from crsp as a left join edu.naics_3_major_bio as b
  on a.naics_3 = b.naics_3;
quit;

data temp2;
  set temp2;
  if year >= 2005;
run;

data crsp_major; set temp; run;
proc append base = crsp_major data = temp2; run;

data crsp_major; set crsp_major; if major = . then delete; run;

* we calculate measures only for stocks that have a valid emp;
proc sql;
  create table base as
  select distinct permno, year, emp, major
  from crsp_major;
quit;

data base; set base; if emp > 0; run;

* first create a large dataset that has 5 years of returns;
proc sql;
  create table return as
  select a.*, b.year as year2, b.month as month2, b.ret
  from base as a, crsp_major as b
  where a.permno = b.permno and 0 <= a.year - b.year <= 4;
quit;

* create major portfolios;
proc sort data = return; by year year2 month2 major; run;
proc means data = return noprint;
  var ret;
  weight emp;
  by year year2 month2 major;
  output out = major mean = ret;
run;

* calculate time-series mean and vol for major portfolios over 5 years or 2 years;
data temp; set major; if major = . then delete; run;
proc sort data = temp; by year major; run;

proc means data = temp noprint;
  by year major;
  var ret;
  output out = stat5major mean = tsmean5 std = tsvol5;
run;

data temptwo; set temp; if year - year2 >= 2 then delete; run;
proc means data = temptwo noprint;
  by year major;
  var ret;
  output out = stat2major mean = tsmean2 std = tsvol2;
run;

* add to permanent datasets;
proc sql;
  create table temp2 as
  select a.*, b.tsmean5, b.tsvol5
  from edu.industry_skew_major_bio as a left join stat5major as b
  on a.year = b.year and a.major = b.major; 
quit;

proc sql;
  create table temp2 as
  select a.*, b.tsmean2, b.tsvol2
  from temp2 as a left join stat2major as b
  on a.year = b.year and a.major = b.major; 
quit;

data edu.industry_skew_major_bio; set temp2; run;
